# Loading packages
library(readtext)
library(readxl)
library(dplyr)
library(tidyverse)
library(tidytext)
library(tm)
library(textstem)
library(wordcloud)
library(slam)
library(topicmodels)
library(SentimentAnalysis)
(…)
(…)
Federal Open Market Committee (FOMC) is the body of the central bank of United States (the Federal Reserve System). Its main duties is setting the national monetary policy. The FOMC makes all decisions regarding the federal funds rate, the size and composition of the Federal Reserve’s asset holdings, and communications with the public about the likely future course of monetary policy. The FOMC consists of 12 voting members: seven members of the Board of Governors, the president of the Federal Reserve Bank of New York and 4 of the remaining 11 Reserve Bank presidents, who serve one-year terms on a rotating basis. All 12 of the Reserve Bank presidents attend FOMC meetings and participate in FOMC discussions, but only the presidents who are Committee members at the time may vote on policy decisions. FOMC meetings typically are held eight times each year in Washington, D.C., and at other times as needed.
Immediately after each FOMC meeting, (…)
(…)
(…) https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm + algorytm skrapowania
(…)
# Loading scrapped statements (from 2006 to 2018)
# DATA_DIR <- "C:/Users/KAndr/OneDrive/Studia/II rok I semestr/Text mining/Text mining project/Statements/"
DATA_DIR = "C:/Users/esobolewska/Documents/FOMC-text-mining/Statements"
fomc_2006 <- readtext(paste0(DATA_DIR, "/2006/*"))
fomc_2007 <- readtext(paste0(DATA_DIR, "/2007/*"))
fomc_2008 <- readtext(paste0(DATA_DIR, "/2008/*"))
fomc_2009 <- readtext(paste0(DATA_DIR, "/2009/*"))
fomc_2010 <- readtext(paste0(DATA_DIR, "/2010/*"))
fomc_2011 <- readtext(paste0(DATA_DIR, "/2011/*"))
fomc_2012 <- readtext(paste0(DATA_DIR, "/2012/*"))
fomc_2013 <- readtext(paste0(DATA_DIR, "/2013/*"))
fomc_2014 <- readtext(paste0(DATA_DIR, "/2014/*"))
fomc_2015 <- readtext(paste0(DATA_DIR, "/2015/*"))
fomc_2016 <- readtext(paste0(DATA_DIR, "/2016/*"))
fomc_2017 <- readtext(paste0(DATA_DIR, "/2017/*"))
fomc_2018 <- readtext(paste0(DATA_DIR, "/2018/*"))
# Binding data
statements <- rbind(fomc_2006, fomc_2007, fomc_2008, fomc_2009, fomc_2010, fomc_2011,
fomc_2012, fomc_2013, fomc_2014, fomc_2015, fomc_2016, fomc_2017, fomc_2018)
# Removing files from memory
remove(fomc_2006, fomc_2007, fomc_2008, fomc_2009, fomc_2010, fomc_2011,
fomc_2012, fomc_2013, fomc_2014, fomc_2015, fomc_2016, fomc_2017, fomc_2018)
# Initial preprocessing
statements <- statements %>% mutate(ID = 1:n())
colnames(statements) <- c("Date", "Text", "ID")
statements$Date <- gsub(".txt", "", statements$Date)
statements$Date <- as.Date(statements$Date, "%Y%m%d ")
statements_all <- as.vector(statements$Text)
length(statements_all) # 107 documents
## [1] 107
# Converting documents into corpus
(corpus_all <- VCorpus(VectorSource(statements_all)))
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 107
inspect(corpus_all[[1]])
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 778
##
## The Federal Open Market Committee decided today to raise its target for the federal funds rate by 25 basis points to 4-1/2 percent. Although recent economic data have been uneven, the expansion in economic activity appears solid. Core inflation has stayed relatively low in recent months and longer-term inflation expectations remain contained. Nevertheless, possible increases in resource utilization as well as elevated energy prices have the potential to add to inflation pressures. The Committee judges that some further policy firming may be needed to keep the risks to the attainment of both sustainable economic growth and price stability roughly in balance. In any event, the Committee will respond to changes in economic prospects as needed to foster these objectives.
as.character(corpus_all[[1]])
## [1] "The Federal Open Market Committee decided today to raise its target for the federal funds rate by 25 basis points to 4-1/2 percent. Although recent economic data have been uneven, the expansion in economic activity appears solid. Core inflation has stayed relatively low in recent months and longer-term inflation expectations remain contained. Nevertheless, possible increases in resource utilization as well as elevated energy prices have the potential to add to inflation pressures. The Committee judges that some further policy firming may be needed to keep the risks to the attainment of both sustainable economic growth and price stability roughly in balance. In any event, the Committee will respond to changes in economic prospects as needed to foster these objectives. "
stopwords <- stopwords("en")
system.time (
corpus_clean <- corpus_all %>%
tm_map(tolower) %>%
tm_map(removeWords, stopwords) %>%
tm_map(removePunctuation) %>%
tm_map(removeNumbers) %>%
tm_map(stripWhitespace) %>%
tm_map(PlainTextDocument)
)
## user system elapsed
## 0.25 0.00 0.27
# example statement after cleaning
as.character(corpus_clean[[1]])
## [1] " federal open market committee decided today raise target federal funds rate basis points percent although recent economic data uneven expansion economic activity appears solid core inflation stayed relatively low recent months longerterm inflation expectations remain contained nevertheless possible increases resource utilization well elevated energy prices potential add inflation pressures committee judges policy firming may needed keep risks attainment sustainable economic growth price stability roughly balance event committee will respond changes economic prospects needed foster objectives "
# example statement after cleaning
as.character(corpus_clean[[2]])
## [1] " federal open market committee decided today raise target federal funds rate basis points percent slowing growth real gdp fourth quarter seems largely reflected temporary special factors economic growth rebounded strongly current quarter appears likely moderate sustainable pace yet run prices energy commodities appears modest effect core inflation ongoing productivity gains helped hold growth unit labor costs check inflation expectations remain contained still possible increases resource utilization combination elevated prices energy commodities potential add inflation pressures committee judges policy firming may needed keep risks attainment sustainable economic growth price stability roughly balance event committee will respond changes economic prospects needed foster objectives"
df_corpus <- data.frame(text = unlist(sapply(corpus_clean, `[`, "content")), stringsAsFactors = F)
df_corpus <- df_corpus %>% mutate(doc_id = 1:n())
statements_clean <- statements %>%
mutate(cleaned_text = df_corpus$text)
count_cleaned_word <- statements_clean %>%
unnest_tokens(word_count, cleaned_text) %>%
count(ID, word_count, sort = T) %>%
group_by(ID) %>%
summarize(word_cleaned_count = sum(n))
statements_clean_count <- left_join(statements_clean, count_cleaned_word, by = 'ID')
count_word <- statements_clean_count %>%
unnest_tokens(word_count, Text) %>%
count(ID, word_count, sort = T) %>%
group_by(ID) %>%
summarize (word_count = sum(n))
statements_final <- left_join(statements_clean_count, count_word, by = 'ID')
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
myplot <- statements_final %>%
select(word_count, Date) %>%
ggplot() +
geom_line(aes(x=Date, y= word_count)) +
theme_minimal()
ggplotly(myplot)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
statements_words <- statements_clean_count %>%
mutate(year = year(Date)) %>%
unnest_tokens(word_count, Text) %>%
count(year, word_count, sort = T)
statements_words
## # A tibble: 4,789 x 3
## year word_count n
## <dbl> <chr> <int>
## 1 2014 the 411
## 2 2013 the 344
## 3 2015 the 288
## 4 2016 the 281
## 5 2017 the 266
## 6 2011 the 236
## 7 2012 the 221
## 8 2014 and 202
## 9 2014 of 199
## 10 2013 and 195
## # ... with 4,779 more rows
statements_words <- statements_words %>%
bind_tf_idf(word_count, year, n)
statements_words
## # A tibble: 4,789 x 6
## year word_count n tf idf tf_idf
## <dbl> <chr> <int> <dbl> <dbl> <dbl>
## 1 2014 the 411 0.0696 0 0
## 2 2013 the 344 0.0657 0 0
## 3 2015 the 288 0.0674 0 0
## 4 2016 the 281 0.0675 0 0
## 5 2017 the 266 0.0696 0 0
## 6 2011 the 236 0.0685 0 0
## 7 2012 the 221 0.0638 0 0
## 8 2014 and 202 0.0342 0 0
## 9 2014 of 199 0.0337 0 0
## 10 2013 and 195 0.0372 0 0
## # ... with 4,779 more rows
pd = statements_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word_count, levels = rev(unique(word_count)))) %>%
group_by(year) %>%
top_n(10) %>%
ungroup() %>%
arrange(year, tf_idf) %>%
mutate(order = row_number())
## Selecting by word
ggplot(pd, aes(order, tf_idf, fill = year)) +
geom_bar(show.legend = FALSE, stat = "identity") +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~year, ncol = 3, scales = "free") +
scale_x_continuous(
breaks = pd$order,
labels = pd$word,
expand = c(0,0)) +
coord_flip()
library(wordcloud)
dtm <- TermDocumentMatrix(corpus_clean)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
## word freq
## committee committee 915
## inflation inflation 834
## will will 638
## economic economic 556
## market market 446
## federal federal 445
## rate rate 418
## labor labor 331
## conditions conditions 326
## securities securities 314
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
max.words=50, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
# Lemmatization
statements_final$lemma_text <- lemmatize_strings(statements_final$cleaned_text)
# Tokenization
tokens <- statements_final %>%
unnest_tokens(word, lemma_text)
# topic modelling - do poprawy na pewno, bo słabo wychodzi
# install.packages("topicmodels")
# Creating a Term document Matrix
tdm = DocumentTermMatrix(corpus_clean)
# create tf-idf matrix
term_tfidf <- tapply(tdm$v/row_sums(tdm)[tdm$i], tdm$j, mean) * log2(nDocs(tdm)/col_sums(tdm > 0))
summary(term_tfidf)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.01091 0.01977 0.02410 0.02952 0.16749
tdm <- tdm[,term_tfidf >= 0.05]
tdm <- tdm[row_sums(tdm) > 0,]
summary(col_sums(tdm))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 1.00 2.33 2.00 36.00
# finding best K
best.model <- lapply(seq(2, 50, by = 1), function(d){LDA(tdm, d)})
best.model.logLik <- as.data.frame(as.matrix(lapply(best.model, logLik)))
# calculating LDA
k = 5 # number of topics
SEED = 112 # number of documents
CSC_TM <-list(VEM = LDA(tdm, k = k,
control = list(seed = SEED)),
VEM_fixed = LDA(tdm, k = k,
control = list(estimate.alpha = FALSE, seed = SEED)),
Gibbs = LDA(tdm, k = k, method = "Gibbs",
control = list(seed = SEED, burnin = 1000, thin = 100, iter = 1000)),
CTM = CTM(tdm, k = k,
control = list(seed = SEED,
var = list(tol = 10^-4),
em = list(tol = 10^-3))))
sapply(CSC_TM[1:2], slot, "alpha")
## VEM VEM_fixed
## 0.02486956 10.00000000
sapply(CSC_TM, function(x) mean(apply(posterior(x)$topics, 1, function(z) sum(z*log(z)))))
## VEM VEM_fixed Gibbs CTM
## -0.2206000 -1.5846064 -1.5947876 -0.5421278
Topic <- topics(CSC_TM[["VEM"]], 1)
Terms <- terms(CSC_TM[["VEM"]], 8)
Terms
## Topic 1 Topic 2 Topic 3 Topic 4
## [1,] "demandnonetheless" "tslf" "bank" "deepening"
## [2,] "impetus" "growthinflation" "central" "mitigate"
## [3,] "arise" "spurred" "arrangements" "carefullytodayâ\200\231s"
## [4,] "carefullyâ" "consult" "swiss" "remainâ"
## [5,] "closed" "dealers" "european" "timely"
## [6,] "forestall" "percentstrains" "funding" "central"
## [7,] "funding" "pricesreadings" "dollar" "cuts"
## [8,] "operations" "softening" "ecb" "downturnâ"
## Topic 5
## [1,] "yet"
## [2,] "fail"
## [3,] "predominant"
## [4,] "adjustment"
## [5,] "moderation"
## [6,] "check"
## [7,] "combination"
## [8,] "convincingly"